n_pop <- 1e6
X <- cbind(
1,
rnorm(n_pop), rnorm(n_pop), rnorm(n_pop)
)
beta <- c(-1.5, log(1.5), log(0.1), log(3)) # true coefficients (log-odds scale)
p <- plogis(X %*% beta)                     # true event probabilities
y <- rbinom(n = n_pop, size = 1, prob = p)  # simulated binary outcomes
df <- data.frame(y, X[, -1], p)
head(df)
## y X1 X2 X3 p
## 1 1 -0.67141163 -1.9508952 -1.1074067 0.81805985
## 2 0 -0.63731704 0.1283204 0.1669023 0.13348185
## 3 0 0.27256001 0.4036060 -0.3696718 0.06151709
## 4 0 1.44706901 0.8588238 -0.4340875 0.03332119
## 5 1 -0.03250427 -1.4420543 0.7990098 0.93614021
## 6 1 0.27473031 -0.1656333 0.4881297 0.38438989
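As a quick sanity check on the simulation, the empirical event rate should match the average of the true probabilities:
# the observed event rate should be close to mean(p)
c(mean_y = mean(df$y), mean_p = mean(df$p))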
Two models: one perfect (the true coefficients) and one a little bit overfit (exaggerated coefficients).
# model 1: the data-generating model (uses the true coefficients)
m1 <- \(x) as.vector(
plogis(
as.matrix(cbind(1, x[, stringr::str_detect(colnames(x), "^X")])) %*% beta
)
)
# model 2: exaggerated coefficients, mimicking an overfit model
beta2 <- c(-2.8, log(2), log(0.005), log(5))
m2 <- \(x) as.vector(
plogis(
as.matrix(cbind(1, x[, stringr::str_detect(colnames(x), "^X")])) %*% beta2
)
)
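Since m1 uses the true coefficients, it should reproduce \(p\) exactly:
all.equal(m1(df), df$p) # TRUE: model 1 recovers the true probabilities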
# draw a validation sample of 100k from the population
ix <- sample(1:n_pop, 1e5)
.df <- df[ix, ]
logit <- \(x) log(x/(1-x))
.df$p_hat1 <- m1(.df)
.df$lp1 <- logit(.df$p_hat1)
.df$p_hat2 <- m2(.df)
.df$lp2 <- logit(.df$p_hat2)
# flexible calibration curves: regress y on a restricted cubic spline
# (5 knots) of each model's linear predictor
cal1 <- glm(y ~ rms::rcs(lp1, 5), data = .df, family = binomial)
cal2 <- glm(y ~ rms::rcs(lp2, 5), data = .df, family = binomial)
.df$cal1_fitted <- predict(cal1, newdata = .df, type = "response")
.df$cal2_fitted <- predict(cal2, newdata = .df, type = "response")
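Since model 1 is perfectly calibrated by construction, its fitted calibration curve should sit close to the identity; any difference reflects spline-fitting noise:
summary(abs(.df$cal1_fitted - .df$p_hat1))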
For each threshold \(t\), a treatment error occurs when \(\hat{p}\) falls on the wrong side of \(t\) relative to the true \(p\), i.e.

\[
\begin{aligned}
& p < t \mid \hat{p} > t \implies \text{overtreatment} \\
& p > t \mid \hat{p} < t \implies \text{undertreatment}
\end{aligned}
\]
We will compute the probability of each event for models M1 and M2, using both the true underlying probability \(p\) and the calibration estimate \(\tilde{p}(x) = \Pr(Y = 1 \mid \hat{p} = x)\), for \(x \in (0, 1)\).
library(tidyverse)
thresholds <- seq(0.02, 0.5, 0.02)
# for one threshold, estimate the probability of each treatment error for
# both models, using the true p and the calibration fit as references
get_treatment_errors <- function(thr, df_val) {
tibble(
type = c(
"P(p > t | p_hat < t)",
"P(p < t | p_hat > t)",
"P(p > t | p_hat < t)",
"P(p < t | p_hat > t)"
),
ref_type = c(
"underlying p",
"underlying p",
"calibration",
"calibration"
),
type_label = c(
"undertreatment",
"overtreatment",
"undertreatment",
"overtreatment"
),
model1 = c(
mean(df_val$p[df_val$p_hat1 < thr] > thr),
mean(df_val$p[df_val$p_hat1 > thr] < thr),
mean(df_val$cal1_fitted[df_val$p_hat1 < thr] > thr),
mean(df_val$cal1_fitted[df_val$p_hat1 > thr] < thr)
),
model2 = c(
mean(df_val$p[df_val$p_hat2 < thr] > thr),
mean(df_val$p[df_val$p_hat2 > thr] < thr),
mean(df_val$cal2_fitted[df_val$p_hat2 < thr] > thr),
mean(df_val$cal2_fitted[df_val$p_hat2 > thr] < thr)
),
thr = rep(thr, 4)
)
}
d <- map_df(thresholds, get_treatment_errors, df_val = .df)
Plot everything to see if calibration \(\tilde{p}\) gives a nice (consistent) estimate of the over-/under-treatment probabilities.
theme_set(theme_minimal())
d %>%
na.omit() %>%
pivot_longer(cols = contains('model')) %>%
ggplot(aes(thr, value, color = name, shape = ref_type, size = ref_type)) +
geom_point() +
facet_wrap(~type) +
scale_shape_manual(values = c(19, 21)) +
scale_size_manual(values = c(1, 3)) +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent,
breaks = scales::pretty_breaks(10)) +
labs(x = "Decision threshold", y = NULL)
Apparently

\[
\Pr(p < t \mid \hat{p} > t) \not\approx \Pr(\tilde{p} < t \mid \hat{p} > t)
\]

If we expand the expression for the different \(p\)'s, the math gets a bit confusing (a conditional probability of conditional probabilities).
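One way to see why, using the definitions above: the calibration fit estimates the conditional mean, \(\tilde{p} = \Pr(Y = 1 \mid \hat{p}) = \operatorname{E}(p \mid \hat{p})\), so

\[
\Pr(\tilde{p} < t \mid \hat{p} > t) = \Pr\big( \operatorname{E}(p \mid \hat{p}) < t \;\big|\; \hat{p} > t \big),
\]

which asks where the calibration curve sits relative to \(t\), whereas \(\Pr(p < t \mid \hat{p} > t)\) concerns the whole conditional distribution of \(p\) given \(\hat{p}\). The calibration version collapses that distribution to its mean, so the two need not agree whenever \(p\) spreads around the curve.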
Calibration conditions on \(\hat{p}\), which hopefully approximates \(p\), but for any given individual \(\hat{p}\) can be arbitrarily far off (e.g. due to case-mix variation).
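For instance, among observations with essentially the same \(\hat{p}\) from model 2 (an arbitrary slice around 0.2), the true \(p\) still varies considerably:
# true p spreads widely among near-identical predictions
.df %>%
  filter(abs(p_hat2 - 0.2) < 0.01) %>%
  summarise(n = n(), p_min = min(p), p_mean = mean(p), p_max = max(p))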
.df %>%
sample_n(2e4) %>%
ggplot(aes(p_hat2)) +
geom_point(aes(y = p), alpha = 0.3) +
geom_point(aes(y = cal2_fitted), color = "blue") +
geom_abline(linetype = 2, color = "red", linewidth = 2) +
labs(x = "Predicted probability (model 2)",
     y = "Underlying p or calibration p")
How can we capture the spread around the calibration curve? Something like a "predictive interval" for the calibration \(p\).
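One rough sketch, possible here only because the true \(p\) is known from the simulation: bin \(\hat{p}\) and take empirical quantiles of \(p\) within each bin (the bin width of 0.05 and the 5%/95% levels are arbitrary choices).
# crude "predictive interval": quantiles of the true p within bins of p_hat2
.df %>%
  mutate(bin = cut(p_hat2, breaks = seq(0, 1, 0.05), include.lowest = TRUE)) %>%
  group_by(bin) %>%
  summarise(
    p_hat_mid = mean(p_hat2),
    lwr = quantile(p, 0.05),
    upr = quantile(p, 0.95),
    .groups = "drop"
  )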
# draw a fresh test sample of 100k
n_test <- 1e5
.df <- df[sample(1:n_pop, n_test), ]
.df$p_hat1 <- m1(.df)
.df$lp1 <- logit(.df$p_hat1)
.df$p_hat2 <- m2(.df)
.df$lp2 <- logit(.df$p_hat2)
# refit the "true" model structure on the test data; this reference
# estimate does not depend on m1 or m2, so it serves for both
test_fit <- glm(y ~ X1 + X2 + X3, data = .df, family = binomial)
.df$cal1_fitted <- predict(test_fit, type = "response")
.df$cal2_fitted <- predict(test_fit, type = "response")
d <- map_df(thresholds, get_treatment_errors, df_val = .df) %>%
mutate(
ref_type = ifelse(
ref_type == "calibration",
"test fit",
ref_type
)
)
d %>%
na.omit() %>%
pivot_longer(cols = contains('model')) %>%
ggplot(aes(thr, value, color = name, shape = ref_type, size = ref_type)) +
geom_point() +
facet_wrap(~type) +
scale_shape_manual(values = c(19, 21)) +
scale_size_manual(values = c(1, 3)) +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent,
breaks = scales::pretty_breaks(10)) +
labs(x = "Decision threshold", y = NULL)
The data set needed to estimate the over-/under-treatment probabilities must be large enough for us to get a second estimate \(\hat{p}_2 = \Pr(Y_{\text{test}} = 1 \mid X_{\text{test}})\), which is analogous to training another model on the test set (a bit like model revision).
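To get a feel for the sample-size requirement, refitting the same reference model on a much smaller test set (500 observations, an arbitrary illustration) gives noticeably noisier coefficients:
# coefficients of the refit reference model get noisy on a small test set
small_fit <- glm(y ~ X1 + X2 + X3, data = df[sample(1:n_pop, 500), ],
                 family = binomial)
cbind(true = beta, large_test = coef(test_fit), small_test = coef(small_fit))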
.df %>%
sample_n(2e4) %>%
ggplot(aes(p_hat2)) +
geom_point(aes(y = p), alpha = 0.8) +
geom_point(aes(y = cal2_fitted), alpha = 0.5, color = "blue",
shape = 21, size = 2) +
geom_abline(linetype = 2, color = "red", linewidth = 2) +
labs(x = "Predicted probability (model 2)",
     y = "Underlying p or calibration p")